# function for fixing legend
# Recode the quantile indicator column (effect_20/effect_50/effect_80)
# into human-readable legend labels. `invar` and `outvar` are bare column
# names, passed through via tidy evaluation; values not matched by
# recode() are left unchanged.
make_nice_effect_quantiles <- function(df, invar, outvar) {
  df %>%
    mutate(
      {{ outvar }} := recode(
        {{ invar }},
        effect_20 = "Effect at 20% quantile",
        effect_50 = "Effect at 50% quantile",
        effect_80 = "Effect at 80% quantile"
      )
    )
}
# Shared colour scale for the three quantile effects: one hue ("#007FA8"),
# lightened progressively for the lower quantiles. Wrapped in a list so it
# can be added onto any ggplot with `+`.
effect_scale <- list(
  scale_color_manual(
    values = c(
      "Effect at 20% quantile" = colorspace::lighten("#007FA8", .7),
      "Effect at 50% quantile" = colorspace::lighten("#007FA8", .4),
      "Effect at 80% quantile" = "#007FA8"
    )
  )
)
# Load the two fitted brms models from disk.
# read_rds() reads the bz2-compressed fit directly.
hm <- read_rds("final_models/17-brm-large-sample.rds.bz2")
# file_refit = "never" guarantees brm() only reads the cached fit and
# never recompiles or resamples.
hm_simple <- brm(file = "final_models/hm_final_rerun.rds", file_refit = "never")
In this last round, the sampler issued warnings about hitting the maximum treedepth:
Warning: 4000 of 4000 (100.0%) transitions hit the maximum treedepth limit of 10.
This was to be expected, given the model took about twice as long to run as in previous iterations. The only aspects that changed were: revision of weights and resampling of data. Given that we had a slightly smaller sample size, it could be that some peculiarities of the specific sample led to inefficient sampling. Inference is likely to be unaffected.
# Posterior predictive check for the large model; x-axis truncated at
# 8000 so the bulk of the APC distribution stays visible.
pp_check(hm, ndraws = 10, cores = mc_cores) +
coord_cartesian(xlim = c(0, 8000))
# Plot posterior predictions against observed APCs for a single country,
# faceted by field.
#
# df                 observation-level data with scaled P_top10
# model              fitted brms model to predict from
# country_selection  country name to filter on
# var_for_offset     scaled variable whose "scaled:center" attribute is
#                    used to back-transform P_top10
# alpha              transparency of the jittered observation points
# ndraws             number of posterior draws for the predictions
pred_vis <- function(df, model, country_selection,
                     var_for_offset = base$P_top10, alpha = 1, ndraws = 1000) {
  centre <- attributes(var_for_offset)[["scaled:center"]]
  # undo the centring + log scaling so the x-axis is on the original scale
  unscale <- function(d) mutate(d, P_top10 = exp(centre + P_top10))

  observed <- filter(df, country == country_selection)

  observed %>%
    modelr::data_grid(P_top10, country, field) %>%
    add_predicted_draws(model, ndraws = ndraws, re_formula = NULL,
                        cores = mc_cores) %>%
    unscale() %>%
    ggplot(aes(P_top10, .prediction)) +
    stat_interval() +
    scale_color_manual(values = colorspace::lighten(clrs[4], c(.8, .67, .42))) +
    scale_y_continuous(labels = dollar) +
    # overlay the actual observations on top of the predicted intervals
    geom_jitter(aes(y = APC_in_dollar), alpha = alpha,
                position = position_jitter(width = 5, height = 50),
                data = unscale(observed)) +
    facet_wrap(vars(field)) +
    labs(y = "Predicted vs. actual APC", x = expression(P["top 10%"]),
         color = "Credible interval") +
    theme_clean() +
    theme(legend.position = "bottom", panel.grid.minor = element_blank())
}
# Predicted vs. observed APCs for two contrasting countries.
pred_vis(base, hm, "Austria", alpha = .5)
pred_vis(base, hm, "Brazil", alpha = .2)
This updated model fares much better for Brazil. The predictions are
still not ideal (underestimating higher APCs of ~2000), but overall much
better than the previous model.
# Further countries; lower alpha where scatters are dense.
pred_vis(base, hm, "China", alpha = .15)
pred_vis(base, hm, "United States", alpha = .2)
pred_vis(base, hm, "Turkey", alpha = .7)
# Same posterior predictive check and Brazil comparison for the simpler model.
pp_check(hm_simple, ndraws = 10, cores = mc_cores) +
coord_cartesian(xlim = c(0, 8000))
pred_vis(base, hm_simple, "Brazil", alpha = .2)
# Full posterior summary of the mixture model (rendered output follows).
summary(hm)
## Family: mixture(hurdle_lognormal, hurdle_lognormal)
## Links: mu1 = identity; sigma1 = identity; hu1 = logit; mu2 = identity; sigma2 = identity; hu2 = logit; theta1 = identity; theta2 = identity
## Formula: APC_in_dollar | weights(total_weight) ~ 1 + P_top10 + (1 + P_top10 | field) + (1 + P_top10 | country)
## hu1 ~ 1 + P_top10 + (1 + P_top10 | field) + (1 + P_top10 | country)
## hu2 ~ 1 + P_top10 + (1 + P_top10 | field) + (1 + P_top10 | country)
## theta1 ~ 1 + (1 | field)
## Data: base (Number of observations: 179710)
## Draws: 4 chains, each with iter = 2000; warmup = 1000; thin = 1;
## total post-warmup draws = 4000
##
## Group-Level Effects:
## ~country (Number of levels: 69)
## Estimate Est.Error l-95% CI u-95% CI Rhat
## sd(mu1_Intercept) 0.46 0.05 0.38 0.56 1.01
## sd(mu1_P_top10) 0.15 0.02 0.11 0.20 1.00
## sd(mu2_Intercept) 0.03 0.00 0.02 0.04 1.00
## sd(mu2_P_top10) 0.01 0.00 0.01 0.02 1.00
## sd(hu1_Intercept) 1.39 0.20 1.05 1.82 1.00
## sd(hu1_P_top10) 0.48 0.14 0.22 0.79 1.01
## sd(hu2_Intercept) 1.86 0.20 1.51 2.28 1.00
## sd(hu2_P_top10) 0.65 0.10 0.48 0.87 1.00
## cor(mu1_Intercept,mu1_P_top10) -0.18 0.16 -0.49 0.15 1.00
## cor(mu2_Intercept,mu2_P_top10) -0.36 0.19 -0.66 0.06 1.00
## cor(hu1_Intercept,hu1_P_top10) -0.35 0.21 -0.70 0.10 1.00
## cor(hu2_Intercept,hu2_P_top10) 0.20 0.17 -0.13 0.51 1.00
## Bulk_ESS Tail_ESS
## sd(mu1_Intercept) 821 1742
## sd(mu1_P_top10) 1445 2500
## sd(mu2_Intercept) 1273 2058
## sd(mu2_P_top10) 1363 2001
## sd(hu1_Intercept) 1297 2058
## sd(hu1_P_top10) 765 948
## sd(hu2_Intercept) 941 2053
## sd(hu2_P_top10) 1255 2314
## cor(mu1_Intercept,mu1_P_top10) 1343 1993
## cor(mu2_Intercept,mu2_P_top10) 1990 2500
## cor(hu1_Intercept,hu1_P_top10) 1878 2605
## cor(hu2_Intercept,hu2_P_top10) 1402 1818
##
## ~field (Number of levels: 19)
## Estimate Est.Error l-95% CI u-95% CI Rhat
## sd(mu1_Intercept) 0.30 0.06 0.21 0.42 1.00
## sd(mu1_P_top10) 0.10 0.03 0.06 0.16 1.00
## sd(mu2_Intercept) 0.18 0.03 0.13 0.26 1.00
## sd(mu2_P_top10) 0.02 0.01 0.01 0.03 1.00
## sd(hu1_Intercept) 1.69 0.25 1.26 2.27 1.00
## sd(hu1_P_top10) 0.24 0.05 0.15 0.36 1.00
## sd(hu2_Intercept) 1.88 0.32 1.37 2.59 1.00
## sd(hu2_P_top10) 0.37 0.08 0.25 0.55 1.00
## sd(theta1_Intercept) 1.12 0.25 0.70 1.67 1.00
## cor(mu1_Intercept,mu1_P_top10) 0.34 0.20 -0.10 0.68 1.00
## cor(mu2_Intercept,mu2_P_top10) -0.25 0.26 -0.70 0.30 1.00
## cor(hu1_Intercept,hu1_P_top10) -0.38 0.20 -0.72 0.04 1.00
## cor(hu2_Intercept,hu2_P_top10) 0.21 0.22 -0.23 0.60 1.00
## Bulk_ESS Tail_ESS
## sd(mu1_Intercept) 1359 2258
## sd(mu1_P_top10) 1411 2328
## sd(mu2_Intercept) 1235 2105
## sd(mu2_P_top10) 1324 2516
## sd(hu1_Intercept) 1407 2147
## sd(hu1_P_top10) 1879 2609
## sd(hu2_Intercept) 1175 2101
## sd(hu2_P_top10) 1569 2636
## sd(theta1_Intercept) 850 1212
## cor(mu1_Intercept,mu1_P_top10) 2013 2483
## cor(mu2_Intercept,mu2_P_top10) 2904 2786
## cor(hu1_Intercept,hu1_P_top10) 2703 2679
## cor(hu2_Intercept,hu2_P_top10) 2358 2879
##
## Population-Level Effects:
## Estimate Est.Error l-95% CI u-95% CI Rhat Bulk_ESS Tail_ESS
## mu1_Intercept 6.85 0.09 6.66 7.02 1.01 403 928
## hu1_Intercept -0.90 0.36 -1.64 -0.25 1.00 640 1286
## mu2_Intercept 7.57 0.04 7.49 7.66 1.00 669 1237
## hu2_Intercept -0.14 0.42 -0.89 0.75 1.00 727 1291
## theta1_Intercept -0.18 0.27 -0.67 0.36 1.01 554 945
## mu1_P_top10 0.13 0.04 0.05 0.21 1.00 1364 2004
## hu1_P_top10 0.09 0.12 -0.14 0.35 1.00 1555 2421
## mu2_P_top10 0.00 0.01 -0.01 0.02 1.00 1570 2168
## hu2_P_top10 -0.13 0.15 -0.41 0.17 1.01 979 2054
##
## Family Specific Parameters:
## Estimate Est.Error l-95% CI u-95% CI Rhat Bulk_ESS Tail_ESS
## sigma1 0.80 0.00 0.79 0.80 1.00 9144 2683
## sigma2 0.20 0.00 0.20 0.20 1.00 8676 3341
##
## Draws were sampled using sample(hmc). For each parameter, Bulk_ESS
## and Tail_ESS are effective sample size measures, and Rhat is the potential
## scale reduction factor on split chains (at convergence, Rhat = 1).
Here we compute marginal effects manually, by making predictions for a given x (P_top10) and for the same x multiplied by a step factor, then comparing the two predictions. The example directly below uses a 10% step (x * 1.1); the averaged marginal effects further down use a 1% step (x * 1.01).
# Manual contrast for Brazil: predictions at P_top10 = 1500 and at
# 1500 * 1.1 — a 10% step on the original scale. (The averaged marginal
# effects further down use a 1% step instead.)
scale_offset <- attributes(base$P_top10)[["scaled:center"]]
x1_identity <- 1500
x2_identity <- x1_identity * 1.1
# move both evaluation points onto the model's scale: log, then centre
x1_log <- log(x1_identity) - scale_offset
x2_log <- log(x2_identity) - scale_offset
contrast <- predictions(
hm,
newdata = datagrid(P_top10 = c(x1_log, x2_log),
country = "Brazil", field = unique(base$field))) %>%
posteriordraws()
# Reshape so each draw holds the prediction at both x values side by side
# ("base" and "step"), then compute the relative change per draw.
# NOTE: inside mutate(), the column `base` masks the global data frame of
# the same name — the division uses the column, as intended.
contrast_recomputed <- contrast %>%
mutate(x = factor(P_top10, labels = c("base", "step"))) %>%
pivot_wider(-c(predicted, conf.low, conf.high, P_top10, rowid),
names_from = x, values_from = draw) %>%
mutate(contrast = step / base - 1)
# Distribution of the contrast per field, ordered by effect size.
contrast_recomputed %>%
ggplot(aes(contrast, fct_reorder(field, contrast))) +
stat_halfeye() +
scale_x_continuous(labels = percent)
This is very sensitive to the respective country. Maybe we can recompute an average marginal effect after all?
# Compute per-draw relative effects of increasing P_top10 by a
# multiplicative step (default +1%) at the q-th quantile of `orig_var`,
# for every country/field combination.
#
# model           fitted brms model to predict from
# orig_var        P_top10 on its original (unscaled) scale
# q               quantile of `orig_var` at which to evaluate the effect
# var_for_offset  scaled variable providing the "scaled:center" attribute
# step_factor     multiplicative step on the original scale (1.01 = +1%);
#                 previously hard-coded, now a backward-compatible parameter
#
# Returns one row per posterior draw and country/field cell, with the two
# predictions in `base` and `step` and their relative difference in
# `contrast` (= step / base - 1).
average_draws <- function(model, orig_var, q = .5,
                          var_for_offset = base$P_top10, step_factor = 1.01) {
  scale_offset <- attributes(var_for_offset)[["scaled:center"]]
  x1_identity <- quantile(orig_var, q)
  x2_identity <- x1_identity * step_factor
  # transform both evaluation points onto the model scale (log, centred)
  x1_log <- log(x1_identity) - scale_offset
  x2_log <- log(x2_identity) - scale_offset
  contrast_all <- predictions(
    model,
    newdata = datagrid(P_top10 = c(x1_log, x2_log),
                       country = unique(base$country),
                       field = unique(base$field))) %>%
    posteriordraws()
  contrast_all %>%
    mutate(x = factor(P_top10, labels = c("base", "step"))) %>%
    pivot_wider(-c(predicted, conf.low, conf.high, P_top10, rowid),
                names_from = x, values_from = draw) %>%
    mutate(contrast = step / base - 1)
}
# Average the per-draw contrasts within `var` (a bare column name, e.g.
# field or country), keeping one mean effect per posterior draw.
# `.groups = "drop_last"` matches summarise()'s default behaviour but
# silences the repeated "has grouped output by ..." message seen in the
# rendered output.
summarise_by <- function(contrast_df, var = field) {
  contrast_df %>%
    group_by({{ var }}, drawid) %>%
    summarise(effect = mean(contrast), .groups = "drop_last")
}
# Half-eye plot of the per-field effect distribution (50% and 90%
# intervals around the HDI median). `location` is interpolated into the
# x-axis label, e.g. "the 20% quantile".
plot_effect <- function(contrast_df, location = "the median") {
  x_label <- glue::glue(
    "% change of APC for 1% change of P_top10% at {location}")
  contrast_df %>%
    ggplot(aes(effect, fct_reorder(field, effect))) +
    stat_halfeye(.width = c(.5, .9), point_interval = "median_hdi") +
    scale_x_continuous(labels = percent) +
    labs(
      y = NULL,
      x = x_label,
      caption = "Averaged predictions over all countries.") +
    theme_clean() +
    coord_cartesian(xlim = c(-0.005, 0.005))
}
# Contrasts at the 20/50/80% quantiles of P_top10.
# NOTE(review): these use `df$P_top10` while the surrounding code uses
# `base` — presumably `df` is defined earlier in the document (it is also
# used below for university counts); confirm it is the intended source.
contrast_20 <- average_draws(hm, df$P_top10, q = .2)
contrast_50 <- average_draws(hm, df$P_top10, q = .5)
contrast_80 <- average_draws(hm, df$P_top10, q = .8)
# Per-field averages and plot at the 20% quantile.
contrast_20_field <- summarise_by(contrast_20, field)
## `summarise()` has grouped output by 'field'. You can override using the
## `.groups` argument.
contrast_20_field %>%
plot_effect("the 20% quantile")
This seems reasonable, but would need to further validate.
At 50%
# Per-field averages at the 50% and 80% quantiles; plot_effect()'s
# default label ("the median") is used for both.
contrast_50_field <- summarise_by(contrast_50, field)
## `summarise()` has grouped output by 'field'. You can override using the
## `.groups` argument.
contrast_50_field %>%
plot_effect()
contrast_80_field <- summarise_by(contrast_80, field)
## `summarise()` has grouped output by 'field'. You can override using the
## `.groups` argument.
contrast_80_field %>%
plot_effect()
Compare all three
# Stack the three quantile results; each keeps its own effect_* column so
# pivot_longer() below can recover the quantile from the column name.
# (The 50/80/20 stacking order is irrelevant after pivoting.)
all_joined <- bind_rows(
rename(contrast_50_field, effect_50 = effect),
rename(contrast_80_field, effect_80 = effect),
rename(contrast_20_field, effect_20 = effect)
)
# Dodged point-interval plot of all three quantile effects per field.
# pivot_longer() puts the quantile id in the default `name` column, which
# make_nice_effect_quantiles() recodes in place for the legend.
p <- all_joined %>%
pivot_longer(contains("effect"), values_to = "effect") %>%
drop_na() %>%
make_nice_effect_quantiles(name, name) %>%
ggplot(aes(effect, fct_reorder(field, effect), colour = name)) +
geom_vline(xintercept = 0, colour = "grey55", linetype = 2) +
stat_pointinterval(position = position_dodge(width = .5),
.width = c(.5, .9)) +
scale_x_continuous(labels = percent) +
labs(
y = NULL,
x = expression(paste("% change of APC for 1% change of ", P["top 10%"],
" at given quantiles")),
caption = "Predictions averaged over all countries.") +
theme_clean() +
coord_cartesian(xlim = c(-0.005, 0.005)) +
guides(colour = guide_legend(reverse = FALSE))
# Render with the shared colour scale and a top-right legend.
p +
effect_scale +
theme(legend.position = "top", legend.justification = c(1, 0)) +
labs(colour = NULL)
Questions that arise: - why in some fields stronger/weaker effect for larger/smaller P_top10? Is this also associated with hurdle component? - Especially: why physics and mathematics negative? because of hurdle? -> Yes
Need to put into context of overall averages: give average per field (from model or full set)
# Context for the effects: estimated APC level per field at the median of
# P_top10, averaging the `base` prediction over countries within each draw.
contrast_50 %>%
group_by(field, drawid) %>%
summarise(intercept = mean(base)) %>%
ggplot(aes(intercept, fct_reorder(field, intercept))) +
stat_halfeye(.width = c(.5, .9), fill = colorspace::lighten("#007FA8"),
point_interval = "median_hdi") +
scale_x_continuous(labels = scales::dollar) +
theme_clean() +
labs(y = NULL, x = expression(paste("Estimated APC at median of ",
P["top 10%"])))
## `summarise()` has grouped output by 'field'. You can override using the
## `.groups` argument.
# Same aggregation as above, but by country instead of field.
contrast_20_country <- summarise_by(contrast_20, country)
## `summarise()` has grouped output by 'country'. You can override using the
## `.groups` argument.
contrast_50_country <- summarise_by(contrast_50, country)
## `summarise()` has grouped output by 'country'. You can override using the
## `.groups` argument.
contrast_80_country <- summarise_by(contrast_80, country)
## `summarise()` has grouped output by 'country'. You can override using the
## `.groups` argument.
# Country-level analogue of all_joined: stack the three quantile results
# and pivot to long form. The trailing comma after the last bind_rows()
# argument was removed — it only worked because dplyr's dynamic dots
# tolerate empty trailing arguments.
all_countries <- bind_rows(
  rename(contrast_20_country, effect_20 = effect),
  rename(contrast_50_country, effect_50 = effect),
  rename(contrast_80_country, effect_80 = effect)
) %>%
  pivot_longer(contains("effect"), values_to = "effect") %>%
  drop_na()
# Per-country effects at the three quantiles. These contrasts are grouped
# by country, i.e. averaged over fields — the original caption claimed
# "over all countries", which was wrong for this plot (cf. the region
# plot below). glue::glue() was dropped for the x label, which contains
# no interpolation.
p_country <- all_countries %>%
  ggplot(aes(effect, fct_reorder(country, effect), colour = name)) +
  geom_vline(xintercept = 0, colour = "grey55", linetype = 2) +
  stat_pointinterval(position = position_dodge(width = .5),
                     .width = c(.5, .9), point_interval = "median_hdi") +
  scale_x_continuous(labels = percent) +
  labs(
    y = NULL,
    x = "% Change of APC for 1% change of P_top10% at given quantiles",
    caption = "Predictions averaged over all fields.") +
  theme_clean() +
  coord_cartesian(xlim = c(-0.001, 0.005)) +
  guides(colour = guide_legend(reverse = TRUE))
# Render with manual colours keyed to the raw effect_* names (this plot
# does not recode them into pretty labels).
p_country + scale_color_manual(values = c(
effect_20 = colorspace::lighten("#007FA8", .7),
effect_50 = colorspace::lighten("#007FA8", .4),
effect_80 = "#007FA8"
))
# Lookup table: one row per country with its region and ISO code.
country_identifier <- base %>%
distinct(country, region, country_code) %>%
drop_na()
# Attach region and ISO code to each country. The join key is spelled out
# to silence the "Joining, by = ..." message and to guard against
# accidental extra shared columns.
all_countries_w_region <- all_countries %>%
  left_join(country_identifier, by = "country")
# Per-country effect plot faceted by world region. Expects `df` shaped
# like all_countries_w_region (effect, name, country, region columns).
# NOTE: layer/theme order matters here — effect_scale is added last so it
# overrides any earlier colour scale.
plot_countries_by_region <- function(df) {
df %>%
make_nice_effect_quantiles(name, color_label) %>%
ggplot(aes(effect, fct_reorder(country, effect), colour = color_label)) +
geom_vline(xintercept = 0, colour = "grey55", linetype = 2) +
stat_pointinterval(position = position_dodge(width = .5),
.width = c(.5, .9), point_interval = "median_hdi") +
scale_x_continuous(labels = percent) +
labs(
y = NULL,
x = expression(paste("% Change of APC for 1% change of ", P["top 10%"],
" at given quantiles")),
caption = "Predictions averaged over all fields.",
colour = NULL) +
theme_clean() +
# wrap long region names so facet strips stay narrow
facet_grid(rows = vars(str_wrap(region, 9)), space = "free_y", scales = "free_y") +
# coord_cartesian(xlim = c(-0.001, 0.005)) +
guides(colour = guide_legend(reverse = FALSE, override.aes = list(size = 2))) +
theme(legend.position = "top") +
effect_scale
}
# Split countries into two figures so neither gets overcrowded:
# Europe + Sub-Saharan Africa, and everything else.
p_europe_subsahara <- all_countries_w_region %>%
filter(str_detect(region, "Europe|Sahara")) %>%
plot_countries_by_region()
p_rest <- all_countries_w_region %>%
filter(!str_detect(region, "Europe|Sahara")) %>%
plot_countries_by_region()
p_europe_subsahara
p_rest
Relate to gdp
# 2019 GDP per capita from the World Bank WDI API (network call).
gdp <- WDI::WDI(start = 2019, end = 2019)
# Join on ISO2 code; country names differ between sources, hence the
# country.x/country.y columns in the result.
country_effect_with_gdp <- all_countries_w_region %>%
left_join(gdp, by = c("country_code" = "iso2c"))
country_effect_with_gdp
## # A tibble: 828,000 × 10
## country.x drawid name effect region count…¹ count…² iso3c year NY.GD…³
## <chr> <fct> <chr> <dbl> <chr> <chr> <chr> <chr> <int> <dbl>
## 1 Algeria 1 effect_… -1.09e-3 Middl… DZ Algeria DZA 2019 4115.
## 2 Algeria 2 effect_… 4.52e-3 Middl… DZ Algeria DZA 2019 4115.
## 3 Algeria 3 effect_… 1.11e-3 Middl… DZ Algeria DZA 2019 4115.
## 4 Algeria 4 effect_… 3.03e-3 Middl… DZ Algeria DZA 2019 4115.
## 5 Algeria 5 effect_… 1.16e-3 Middl… DZ Algeria DZA 2019 4115.
## 6 Algeria 6 effect_… 3.13e-3 Middl… DZ Algeria DZA 2019 4115.
## 7 Algeria 7 effect_… 9.06e-4 Middl… DZ Algeria DZA 2019 4115.
## 8 Algeria 8 effect_… 7.61e-6 Middl… DZ Algeria DZA 2019 4115.
## 9 Algeria 9 effect_… 4.60e-3 Middl… DZ Algeria DZA 2019 4115.
## 10 Algeria 10 effect_… -1.17e-3 Middl… DZ Algeria DZA 2019 4115.
## # … with 827,990 more rows, and abbreviated variable names ¹country_code,
## # ²country.y, ³NY.GDP.PCAP.KD
# Median effect (50% HDI) per country against GDP per capita.
p <- country_effect_with_gdp %>%
filter(name == "effect_50") %>%
rename(gdp = NY.GDP.PCAP.KD) %>%
# one interval per country; gdp/region are constant within country
group_by(country_code, country.x, gdp, region) %>%
point_interval(effect, .width = .5, .point = median, .interval = hdi) %>%
ggplot(aes(gdp, effect, colour = region, label = country.x)) +
geom_pointrange(aes(ymin = .lower, ymax = .upper)) +
scale_y_continuous(labels = scales::percent) +
scale_x_continuous(labels = scales::comma) +
scale_color_discrete_qualitative(palette = "Dark 3") +
labs(colour = NULL, x = "GDP per capita",
y = "% increase in APC for 1% increase of P_top10 at the median") +
theme_clean() +
theme(legend.position = "top")
#coord_cartesian(ylim = c(0, .003))
p
## Warning: Removed 1 rows containing missing values (geom_pointrange).
# Interactive version (the unused `label` aesthetic feeds the tooltips).
plotly::ggplotly(p)
This could still be improved by adding number of universities (or papers) as a size variable.
# Number of distinct universities per country, used as an alpha/size cue.
# NOTE(review): reads from `df`, not `base` — confirm `df` (defined
# earlier in the document, presumably) carries the University column.
unis_p_country <- df %>%
distinct(country, University) %>%
count(country, name = "n_universities")
# Same per-country medians as above, with university counts attached.
pdata <- country_effect_with_gdp %>%
filter(name == "effect_50") %>%
rename(gdp = NY.GDP.PCAP.KD) %>%
group_by(country_code, country.x, gdp, region) %>%
point_interval(effect, .width = .5, .point = median, .interval = hdi) %>%
left_join(unis_p_country, by = c("country.x" = "country"))
# Only a hand-picked set of countries gets a text label; all others get
# an empty string so geom_text_repel() skips them.
labels <- pdata %>%
mutate(label = case_when(
country.x %in% c("China", "India", "Iran", "Germany", "United States",
"Brazil", "Luxembourg", "Czech Republic") ~ country.x,
TRUE ~ ""))
# GDP vs. median effect, with point/interval transparency encoding the
# number of universities (log10 alpha scale) and selected countries
# labelled via ggrepel.
pdata %>%
ggplot(aes(gdp, effect, colour = region, label = "")) +
geom_linerange(aes(ymin = .lower, ymax = .upper, alpha = n_universities)) +
geom_point(aes(alpha = n_universities), size = 2) +
ggrepel::geom_text_repel(data = labels, aes(label = label),
show.legend = FALSE, max.overlaps = Inf,
box.padding = 1, min.segment.length = 0) +
scale_y_continuous(labels = scales::percent) +
scale_x_continuous(labels = scales::dollar) +
scale_color_discrete_qualitative(palette = "Dark 3") +
# scale_size_continuous(trans = "sqrt") +
scale_alpha_continuous(range = c(.2, 1), trans = "log10") +
labs(colour = "Region", x = "GDP per capita", alpha = "Number of universities",
y = expression(
paste("% increase in APC for 1% increase of ", P["top 10%"],
" at the median")
)) +
theme_clean() +
theme(legend.position = "top", legend.box = "vertical")
## Warning: Removed 1 rows containing missing values (geom_segment).
## Warning: Removed 1 rows containing missing values (geom_point).
## Warning: Removed 1 rows containing missing values (geom_text_repel).
# coord_cartesian(ylim = c(0, .003))
# Estimated APC level per country at the median of P_top10, averaging the
# `base` prediction over fields within each draw (field analogue above).
contrast_50 %>%
group_by(country, drawid) %>%
summarise(intercept = mean(base)) %>%
ggplot(aes(intercept, fct_reorder(country, intercept))) +
stat_halfeye(.width = c(.5, .9), fill = colorspace::lighten("#007FA8"),
point_interval = "median_hdi") +
scale_x_continuous(labels = scales::dollar) +
theme_clean() +
labs(y = NULL, x = expression(paste("Estimated APC at median of ",
P["top 10%"])))
## `summarise()` has grouped output by 'country'. You can override using the
## `.groups` argument.
Not sure whether this is helpful. What we show is quite abstract (the APC at a
hypothetical level of P_top10, averaged across all fields, weighting them
equally).
The same would apply to the field intercepts above.